import numpy as np
import pandas as pd
import scipy.stats as stats
import pickle
from decision_company import read_csv_file, df_copy, dropna, get_dummies, corr, f_oneway, avg, sem, t_ppf


# Load the raw match table.
atp_tennis = read_csv_file('atp_tennis.csv')

# Columns this analysis works with: the surface label plus both players'
# rankings and win/loss ratios.
_ratio_columns = ['Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']
_selected_columns = ['Surface', 'Rank_1', 'Rank_2'] + _ratio_columns

# Work on a copy so the loaded table itself is left untouched.
surface_data = df_copy(atp_tennis[_selected_columns])

# Drop rows lacking either win/loss ratio, then expand 'Surface' into one
# indicator column per surface value. The empty prefix and separator make the
# new columns carry the bare surface names (e.g. 'Hard', 'Clay', 'Grass').
surface_data_clean = get_dummies(
    dropna(surface_data, subset_columns=_ratio_columns),
    columns=['Surface'],
    prefix='',
    prefix_sep='',
)

# Per-match averages: each derived column is the midpoint of the two players'
# values for that match.
_averaged_pairs = {
    'Avg_Rank': ('Rank_1', 'Rank_2'),
    'Avg_Win_Loss_Ratio': ('Win_Loss_Ratio_1', 'Win_Loss_Ratio_2'),
}
for _target, (_first, _second) in _averaged_pairs.items():
    surface_data_clean.loc[:, _target] = (
        surface_data_clean[_first] + surface_data_clean[_second]
    ) / 2

# Correlation matrix across the surface indicators and the two averages.
_corr_columns = ['Hard', 'Clay', 'Grass', 'Avg_Rank', 'Avg_Win_Loss_Ratio']
corr_matrix = corr(surface_data_clean[_corr_columns])

# Perform Statistical Tests
def _ratios_on(surface):
    """Return the Avg_Win_Loss_Ratio values of rows whose *surface* flag is 1."""
    ratios = surface_data_clean['Avg_Win_Loss_Ratio']
    return ratios[surface_data_clean[surface] == 1]


def _interval(center, standard_error):
    """Return (lower, upper) as center -/+ t_critical * standard_error."""
    return (center - t_critical * standard_error,
            center + t_critical * standard_error)


# Select each surface's ratios once and reuse them for every statistic below.
_hard_ratios = _ratios_on('Hard')
_clay_ratios = _ratios_on('Clay')
_grass_ratios = _ratios_on('Grass')

# One-way ANOVA comparing the average win/loss ratio across the three surfaces.
anova_result_clean = f_oneway(_hard_ratios, _clay_ratios, _grass_ratios)

# Mean and standard error of the mean for each surface type.
hard_mean, hard_sem = avg(_hard_ratios), sem(_hard_ratios)
clay_mean, clay_sem = avg(_clay_ratios), sem(_clay_ratios)
grass_mean, grass_sem = avg(_grass_ratios), sem(_grass_ratios)

# Two-sided critical value from the t-distribution at the chosen confidence.
# NOTE(review): degrees of freedom are taken from the whole cleaned table
# (every surface combined), not from each surface's own sample size, while the
# standard errors above are per surface -- confirm this is intended.
confidence_level = 0.95
degrees_of_freedom = len(surface_data_clean) - 1
t_critical = t_ppf((1 + confidence_level) / 2, degrees_of_freedom)

# Confidence interval for each surface's mean.
hard_ci = _interval(hard_mean, hard_sem)
clay_ci = _interval(clay_mean, clay_sem)
grass_ci = _interval(grass_mean, grass_sem)

# Save Results
def _report_and_store(label, value, stem):
    """Print *value* after *label*, then pickle it to ./ref_result/<stem>.pkl.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the dump finishes instead of being left open for the garbage collector.
    """
    print(label, value)
    with open("./ref_result/" + stem + ".pkl", "wb") as handle:
        pickle.dump(value, handle)


_report_and_store("Correlation Matrix:\n", corr_matrix, "corr_matrix")
_report_and_store("P-value:", anova_result_clean.pvalue, "pvalue")
_report_and_store("Confidence Intervals:\n", t_critical, "t_critical")
# Label corrected from the garbled "Hard SurnConfidenceface:" so it matches the
# "Clay Surface:" / "Grass Surface:" labels below.
_report_and_store("Hard Surface:", hard_ci, "hard_ci")
_report_and_store("Clay Surface:", clay_ci, "clay_ci")
_report_and_store("Grass Surface:", grass_ci, "grass_ci")